This notebook shows how to do a logistic regression with pandas and statsmodels to predict whether Titanic passengers survived!
I used this nice tutorial as a reference.
Let's start by firing up the IPython notebook and loading the data. When you're doing data analysis, you'll often find that you spend a lot more time than you would have guessed inspecting and cleaning the data.
# Set some ipython defaults
%pylab inline
rcParams['figure.figsize'] = 8, 6 # that's default image size for this interactive session
font = {'family': 'sans-serif',  # 'normal' is not a real font family and makes matplotlib warn
        'weight': 'bold',
        'size': 16}
rc('font', **font)
Populating the interactive namespace from numpy and matplotlib
# first let's load up the data and look at it
import pandas as pd # used to keep data looking pretty
import numpy as np
# load the raw_data first
raw_data = pd.read_csv('titanic.csv')
raw_columns = raw_data.columns
# We'll also re-index
raw_data.index = raw_data['PassengerId']
# and we'll take all of our training data and put it into the "real_train" index
real_train = raw_data.index
# now let's load the test data set
test_data = pd.read_csv('titanic_test.csv')
# We'll make the survival NA for the test data set
test_data['Survived'] = np.NaN
test_data.index = test_data['PassengerId']
real_test = test_data.index
# Now let's merge the dataframes
# I'm going to merge them together for convenience so that any transformations I make are applied to both data sets
data = pd.concat([raw_data, test_data])
# remove raw training and testing data
del raw_data
del test_data
# Show the first 5 rows
print(data.head())
             Age Cabin Embarked     Fare  \
PassengerId
1             22   NaN        S   7.2500
2             38   C85        C  71.2833
3             26   NaN        S   7.9250
4             35  C123        S  53.1000
5             35   NaN        S   8.0500

                                                          Name  Parch  \
PassengerId
1                                      Braund, Mr. Owen Harris      0
2            Cumings, Mrs. John Bradley (Florence Briggs Th...      0
3                                       Heikkinen, Miss. Laina      0
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)      0
5                                     Allen, Mr. William Henry      0

             PassengerId  Pclass     Sex  SibSp  Survived            Ticket
PassengerId
1                      1       3    male      1         0         A/5 21171
2                      2       1  female      1         1          PC 17599
3                      3       3  female      0         1  STON/O2. 3101282
4                      4       1  female      1         1            113803
5                      5       3    male      0         0            373450
# If I want to look at just the training data set, I can do that too
print(data.ix[real_train].describe())
              Age        Fare       Parch  PassengerId      Pclass       SibSp    Survived
count  714.000000  891.000000  891.000000   891.000000  891.000000  891.000000  891.000000
mean    29.699118   32.204208    0.381594   446.000000    2.308642    0.523008    0.383838
std     14.526497   49.693429    0.806057   257.353842    0.836071    1.102743    0.486592
min      0.420000    0.000000    0.000000     1.000000    1.000000    0.000000    0.000000
25%     20.125000    7.910400    0.000000   223.500000    2.000000    0.000000    0.000000
50%     28.000000   14.454200    0.000000   446.000000    3.000000    0.000000    0.000000
75%     38.000000   31.000000    0.000000   668.500000    3.000000    1.000000    1.000000
max     80.000000  512.329200    6.000000   891.000000    3.000000    8.000000    1.000000
# Check that we've gotten rid of all the NaNs or other bad values
def is_bad(series):
    """Returns a Boolean series flagging the bad values (null, +/- inf, or '')
    input: series
    return: Boolean series
    """
    return pd.isnull(series) | series.apply(lambda x: (x == np.inf or x == -np.inf or x == ''))
print(data.apply(is_bad).sum())
Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64
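(If you'd rather eyeball fractions than raw counts, here's a quick one-liner sketch reusing is_bad from above; it wasn't part of the original run:)
# Fraction of bad values per column
print(data.apply(is_bad).mean())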
Crap, there are a lot of missing values! Let's fix Age, Embarked, and Fare.
# Now let's fix the missing embarked data
# first we'll see what the most common embarkment types are:
print(data.groupby('Embarked').count()['PassengerId'])
Embarked
C    270
Q    123
S    914
Name: PassengerId, dtype: int64
# So let's fill the missing values with S, which is by far the most common embarkation point
data['Embarked'] = data['Embarked'].fillna('S')
print(data.groupby('Embarked').count()['PassengerId'])
Embarked
C    270
Q    123
S    916
Name: PassengerId, dtype: int64
print(data[is_bad(data['Fare'])])
              Age Cabin Embarked  Fare                Name  Parch  \
PassengerId
1044         60.5   NaN        S   NaN  Storey, Mr. Thomas      0

             PassengerId  Pclass   Sex  SibSp  Survived Ticket
PassengerId
1044                1044       3  male      0       NaN   3701
# and let's fix the missing fare value. While I'm at it, I'll also nudge any zero fares so that I can later take the logarithm
# Other than being old, the guy doesn't seem unusual, so I'll just use the median fare of his class and gender
data['fFare'] = data['Fare'].fillna(data[(data['Sex'] == 'male') & (data['Pclass'] == 3)]['Fare'].dropna().median())
data['fFare'] = data['fFare'].apply(lambda x: .01 if x == 0 else x)  # nudge zeros on fFare (not Fare) so the median fill is preserved
data['zFare'] = (data['Fare'] == 0)  # I'm guessing these people were comp'd, may be important!
print(data['fFare'].describe())
print(data[data['zFare']]['fFare'])
count    1309.000000
mean       33.270181
std        51.746974
min         0.010000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
dtype: float64
PassengerId
180     0.01
264     0.01
272     0.01
278     0.01
303     0.01
414     0.01
467     0.01
482     0.01
598     0.01
634     0.01
675     0.01
733     0.01
807     0.01
816     0.01
823     0.01
1158    0.01
1264    0.01
Name: fFare, dtype: float64
# fill in empty cells
data['fSibSp'] = data['SibSp'].fillna(data['SibSp'].dropna().median())
data['fParch'] = data['Parch'].fillna(data['Parch'].dropna().median())
print(data[['fSibSp', 'fParch']].describe())
            fSibSp       fParch
count  1309.000000  1309.000000
mean      0.498854     0.385027
std       1.041658     0.865560
min       0.000000     0.000000
25%       0.000000     0.000000
50%       0.000000     0.000000
75%       1.000000     0.000000
max       8.000000     9.000000
# For now let's just use a simple class model to fill in missing Ages.
avg_age_dict = data.groupby(['Pclass', 'Sex']).mean()['Age'].to_dict()
data['avg_Age'] = [avg_age_dict[(c, s)] for c, s in
zip(data['Pclass'], data['Sex'])]
data['fAge'] = data['Age'].fillna(data['avg_Age'])
#Also fix any zeros:
data['fAge'] = data['fAge'].apply(lambda x: 1 if x == 0 else x)
print(data.apply(is_bad).sum())
Age            263
Cabin         1014
Embarked         0
Fare             1
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
fFare            0
zFare            0
fSibSp           0
fParch           0
avg_Age          0
fAge             0
dtype: int64
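By the way, the dictionary lookup above can be written more compactly with groupby/transform. A sketch that should give the same fills (fAge_alt is just an illustrative name; the check compares against the pre-zero-fix fill):
# Same class/sex mean-age imputation via groupby/transform
class_sex_mean = data.groupby(['Pclass', 'Sex'])['Age'].transform(np.mean)
data['fAge_alt'] = data['Age'].fillna(class_sex_mean)
print((data['fAge_alt'] - data['Age'].fillna(data['avg_Age'])).abs().max())  # expect ~0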
Since we think age is pretty important, let's use a random forest to predict it based on as many of the other variables as possible.
First I'm going to go about extracting even more data from some of the variables as well as get them in the right form to use.
# First, we need to convert categorical variables to integer form
cat_map = {}
cat_cols = [col for col, t in data.dtypes.to_dict().items() if t == np.dtype('O')]
def factorize(data, cat_cols, cat_map=cat_map):
    """Convenience function for making categorical Series from string Series
    inputs: dataframe, list of string categorical columns
    outputs: dataframe, cat_map
    """
    for col in cat_cols:
        # factorize returns integer codes plus the distinct values they index into
        temp, rev = pd.factorize(data[col])
        cat_map['_' + col] = pd.Categorical(temp, rev)  # pd.Factor is deprecated in favor of pd.Categorical
        data['_' + col] = temp
    return data, cat_map
data, cat_map = factorize(data, cat_cols)
print(data.apply(is_bad).sum())
print([c == np.dtype('O') for c in data.dtypes])
Age            263
Cabin         1014
Embarked         0
Fare             1
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
fFare            0
zFare            0
fSibSp           0
fParch           0
avg_Age          0
fAge             0
_Name            0
_Embarked        0
_Sex             0
_Ticket          0
_Cabin           0
dtype: int64
[False, True, True, False, True, False, False, False, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False]
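To see what pd.factorize is doing, here's a toy example (the values are made up for illustration): it returns integer codes plus the distinct values, with each code indexing into those values.
codes, labels = pd.factorize(pd.Series(['S', 'C', 'S', 'Q']))
print(codes)   # [0 1 0 2] -- one integer code per row
print(labels)  # the distinct values, in order of first appearance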
# Now let's pick the columns to use
def is_too_many(df, col):
    "Checks whether a column has too many distinct values to be useful as a categorical"
    return len(df[col].drop_duplicates()) > 120
def find_clean_cols(data):
clean_features = set([col for col, t in data.dtypes.to_dict().items()
if (t != np.dtype('O') and
not any(is_bad(data[col])) and not
(is_too_many(data, col) and
t == np.dtype('int64') ))])
clean_features = clean_features - set(['avg_Age', 'Parch', 'SibSp', 'Age_guess', '_Age', 'fAge', 'sqrt_fare', 'log_fare', 'emp prediction', 'Prediction', 'sqrt_age', 'log_age', 'Prediction %', 'survival chance', 's_Age', 'Best LLRP', 'Best AIC', 'Best BIC'])
clean_features = clean_features - set(['sttl_Miss', 'sttl_Mr', 'Tick_pref', 'sttl_Lady', 'sttl_Dr', 'sttl_Rev', 'sttl_Mst', 'sttl_Mrs', 'sttl_Mlle'])
clean_features = list(clean_features)
return clean_features
clean_features = find_clean_cols(data)
print(clean_features)
# print(data.apply(is_bad).sum()[clean_features])
['_Embarked', 'fSibSp', 'zFare', 'fParch', 'Pclass', 'fFare', '_Sex']
# Verify that our data is now clean
print(data.apply(is_bad).sum()[clean_features + ['_Age']])
_Embarked      0
fSibSp         0
zFare          0
fParch         0
Pclass         0
fFare          0
_Sex           0
_Age         NaN
dtype: float64
So now we have clean data to use!
In this section we'll look at the performance of two simple models: just predicting the most likely outcome for each (gender, class) pair, and a very simple logistic regression.
We saw that survival rates for females are a lot higher, and that higher passenger classes matter too. So now we have a sense of the data and know roughly what to expect. To start, let's do a really simple logistic regression using just those variables.
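Quick refresher on the model we're fitting: logistic regression models the survival probability as P(Survived = 1 | x) = 1 / (1 + exp(-x'beta)), which is the same as saying the log-odds are linear in the covariates: log(p / (1 - p)) = x'beta. That's also why we add an explicit constant column below.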
# Now let's clean up the data for doing the regression
# We'll start simple with just a few columns
# Define some easy columns to use
simple_columns = []
# Create dummy variables
# I use .ix[:, :-1] to avoid multicollinearity / dummy variable trap
sex_dummies = pd.get_dummies(data['Sex'], prefix='sex').ix[:, :-1]
class_dummies = pd.get_dummies(data['Pclass'], prefix='class').ix[:, :-1]
embark_dummies = pd.get_dummies(data['Embarked'], prefix='embarked').ix[:, :-1]
bin_cols = list(sex_dummies.columns.values) + list(class_dummies.columns.values) + list(embark_dummies.columns.values)
# merge them together. Do it conditionally so that this cell is idempotent
for col in bin_cols:
if col in data.columns.values:
data = data.drop(col, 1)
data = data.join(sex_dummies, how='inner').join(class_dummies, how='inner').join(embark_dummies, how='inner')
# now we'll add an intercept
data['const'] = 1
# Check for bad data in the training set
print(data.ix[real_train, ['const', 'Survived'] + bin_cols].apply(is_bad).sum())
const         0
Survived      0
sex_female    0
class_1       0
class_2       0
embarked_C    0
embarked_Q    0
dtype: int64
# Let's pick the columns we'll use for regression and quickly look at their values
simple_reg_columns = bin_cols + simple_columns
simple_reg_columns = ['const'] + simple_reg_columns
print(data.ix[real_train, simple_reg_columns].describe())
       const  sex_female     class_1     class_2  embarked_C  embarked_Q
count    891  891.000000  891.000000  891.000000  891.000000  891.000000
mean       1    0.352413    0.242424    0.206510    0.188552    0.086420
std        0    0.477990    0.428790    0.405028    0.391372    0.281141
min        1    0.000000    0.000000    0.000000    0.000000    0.000000
25%        1    0.000000    0.000000    0.000000    0.000000    0.000000
50%        1    0.000000    0.000000    0.000000    0.000000    0.000000
75%        1    1.000000    0.000000    0.000000    0.000000    0.000000
max        1    1.000000    1.000000    1.000000    1.000000    1.000000
Okay, so things look good so far. Now let's do an actual regression!
# We'll use statsmodels for the logistic regression
import statsmodels.api as sm
# Let's pick the columns we'll use for regression
simple_reg_columns = bin_cols + simple_columns
simple_reg_columns = ['const'] + simple_reg_columns
log_reg_model = sm.Logit(data.ix[real_train,'Survived'], data.ix[real_train, simple_reg_columns])
log_reg_result = log_reg_model.fit()
print(log_reg_result.summary())
Optimization terminated successfully.
         Current function value: 0.459610
         Iterations 6
                           Logit Regression Results
==============================================================================
Dep. Variable:               Survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      885
Method:                           MLE   Df Model:                            5
Date:                Sun, 02 Mar 2014   Pseudo R-squ.:                  0.3098
Time:                        13:36:27   Log-Likelihood:                -409.51
converged:                       True   LL-Null:                       -593.33
                                        LLR p-value:                 2.797e-77
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -2.4049      0.174    -13.821      0.000        -2.746    -2.064
sex_female     2.6142      0.185     14.099      0.000         2.251     2.978
class_1        1.8472      0.224      8.236      0.000         1.408     2.287
class_2        1.1698      0.227      5.143      0.000         0.724     1.616
embarked_C     0.5914      0.228      2.595      0.009         0.145     1.038
embarked_Q     0.4483      0.316      1.420      0.156        -0.171     1.067
==============================================================================
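One way to make these coefficients more tangible: since the model is linear in the log-odds, exp(coef) is an odds ratio. A quick sketch computed from the fit above:
# Exponentiate the log-odds coefficients to get odds ratios
print(np.exp(log_reg_result.params))
# e.g. exp(2.6142) is about 13.7: all else equal, being female multiplies
# the odds of survival by roughly 13.7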
# Let's do it with regularization!
import statsmodels.api as sm
# Set regularization penalty
alpha = 0.01 * len(data.ix[real_train]) * np.ones(data.ix[real_train, simple_reg_columns].shape[1])
alpha[0] = 0 # don't regularize the constant
# print(alpha)
log_reg_result_reg = log_reg_model.fit_regularized(method='l1', alpha=alpha)
print(log_reg_result_reg.summary())
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.511942131596
            Iterations: 32
            Function evaluations: 32
            Gradient evaluations: 32
                           Logit Regression Results
==============================================================================
Dep. Variable:               Survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      886
Method:                           MLE   Df Model:                            4
Date:                Sun, 02 Mar 2014   Pseudo R-squ.:                  0.2952
Time:                        13:36:27   Log-Likelihood:                -418.16
converged:                       True   LL-Null:                       -593.33
                                        LLR p-value:                 1.480e-74
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -1.8412      0.141    -13.084      0.000        -2.117    -1.565
sex_female     2.2744      0.169     13.485      0.000         1.944     2.605
class_1        1.2895      0.203      6.338      0.000         0.891     1.688
class_2        0.5178      0.208      2.486      0.013         0.110     0.926
embarked_C     0.1813      0.215      0.843      0.399        -0.240     0.603
embarked_Q          0        nan        nan        nan           nan       nan
==============================================================================
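A note on what fit_regularized is doing here: with method='l1' it maximizes the penalized log-likelihood loglik(beta) - sum_j alpha_j * |beta_j|. The absolute-value penalty can push coefficients exactly to zero, which is why embarked_Q drops out of the fit above (and since we set alpha[0] = 0, the constant is never penalized).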
def viz_sm_coefs(result_obj):
    params = result_obj.params.copy()
    params.sort()
    # create the two panels in one call (a stray plt.figure() here would spawn an empty extra figure)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    params.plot(kind='bar', ax=axes[0])
    axes[0].set_title('Regression Coefficients')
    params_abs = params.apply(abs)
    params_abs.sort()
    params_abs.plot(kind='bar', ax=axes[1])
    axes[1].set_title('Abs(Regression Coefficients)')
    return None
viz_sm_coefs(log_reg_result)
show()
So we've now completed the logistic regression, and as expected, pretty much all of the variables are significant and point in the expected direction (females are more likely to survive; lower classes are more likely to die).
But raw coefficients are hard to interpret, so let's look at the predicted survival rates.
# Predict results
# Now let's see if they would survive!
data['survival chance'] = 0 # can help to initialize for all values
data['survival chance'] = log_reg_result.predict(data[simple_reg_columns])
# print(data.ix[real_train, 'survival chance'])
print(pd.crosstab(data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass'], values=data.ix[real_train, 'survival chance'], aggfunc=np.mean))
Pclass         1         2         3
Sex
female  0.908633  0.807757  0.598604
male    0.414660  0.237134  0.094528
So, as we expected, females in first class have the highest survival chance at just over 90%, whereas males in third class have less than a 10% survival rate. Let's compare this to the empirical means again:
# Let's compare to the realized values
empirical_survival_rates = pd.crosstab(data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass'], values=data.ix[real_train, 'Survived'], aggfunc=np.mean)
print(empirical_survival_rates)
Pclass         1         2         3
Sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
# And let's make sure we have enough data points for this to be meaningful
print(pd.crosstab(data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass']))
Pclass    1    2    3
Sex
female   94   76  144
male    122  108  347
Okay, so that was cool, but as we showed above... we haven't really done anything particularly awesome yet. In fact, you could argue that we'd actually be better off just going with the empirical survival rates instead of the results of the logistic regression (maybe or maybe not, given the small sample sizes). So let's add some more possibly informative covariates, like age, siblings, and parents, and then inspect the results.
# add more data!
more_cols = ['fAge', 'fFare', 'fSibSp', 'fParch']
print(data[simple_reg_columns + more_cols].describe())
       const   sex_female      class_1      class_2   embarked_C   embarked_Q  \
count   1309  1309.000000  1309.000000  1309.000000  1309.000000  1309.000000
mean       1     0.355997     0.246753     0.211612     0.206264     0.093965
std        0     0.478997     0.431287     0.408607     0.404777     0.291891
min        1     0.000000     0.000000     0.000000     0.000000     0.000000
25%        1     0.000000     0.000000     0.000000     0.000000     0.000000
50%        1     0.000000     0.000000     0.000000     0.000000     0.000000
75%        1     1.000000     0.000000     0.000000     0.000000     0.000000
max        1     1.000000     1.000000     1.000000     1.000000     1.000000

              fAge        fFare       fSibSp       fParch
count  1309.000000  1309.000000  1309.000000  1309.000000
mean     29.376186    33.270181     0.498854     0.385027
std      13.169015    51.746974     1.041658     0.865560
min       0.170000     0.010000     0.000000     0.000000
25%      22.000000     7.895800     0.000000     0.000000
50%      26.000000    14.454200     0.000000     0.000000
75%      37.000000    31.275000     1.000000     0.000000
max      80.000000   512.329200     8.000000     9.000000
# Let's even use some transformations of some covariates!
data['log_fare'] = data['fFare'].apply(np.log)
data['sqrt_fare'] = data['fFare'].apply(np.sqrt)
data['log_age'] = data['fAge'].apply(np.log)
data['child'] = data['fAge'].apply(lambda x: 1 if x < 18 else 0)
data['sqrt_age'] = data['fAge'].apply(np.sqrt)
data['fam_size'] = data['fSibSp'] + data['fParch']
more_cols = ['fAge', 'fFare', 'fSibSp', 'fParch'] + ['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child']
for col in more_cols:
if col not in data.columns:
data = data.join(data.ix[real_train, col])
print(data[more_cols].describe())
# print(data.head())
              fAge        fFare       fSibSp       fParch     log_fare  \
count  1309.000000  1309.000000  1309.000000  1309.000000  1309.000000
mean     29.376186    33.270181     0.498854     0.385027     2.845042
std      13.169015    51.746974     1.041658     0.865560     1.292548
min       0.170000     0.010000     0.000000     0.000000    -4.605170
25%      22.000000     7.895800     0.000000     0.000000     2.066331
50%      26.000000    14.454200     0.000000     0.000000     2.670985
75%      37.000000    31.275000     1.000000     0.000000     3.442819
max      80.000000   512.329200     8.000000     9.000000     6.238967

           log_age    sqrt_fare     sqrt_age     fam_size        child
count  1309.000000  1309.000000  1309.000000  1309.000000  1309.000000
mean      3.224335     4.907291     5.255172     0.883881     0.117647
std       0.704259     3.032441     1.326913     1.583639     0.322313
min      -1.771957     0.100000     0.412311     0.000000     0.000000
25%       3.091042     2.809947     4.690416     0.000000     0.000000
50%       3.258097     3.801868     5.099020     0.000000     0.000000
75%       3.610918     5.592406     6.082763     1.000000     0.000000
max       4.382027    22.634690     8.944272    10.000000     1.000000
print(data[['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child']].dtypes)
print(data[['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child']].apply(is_bad).sum())
log_fare     float64
log_age      float64
sqrt_fare    float64
sqrt_age     float64
fam_size       int64
child          int64
dtype: object
log_fare     0
log_age      0
sqrt_fare    0
sqrt_age     0
fam_size     0
child        0
dtype: int64
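Why bother with the log transform? Fare is heavily right-skewed (median around 14 but a max over 512), and taking logs pulls that tail in. A quick check using scipy (a sketch, not part of the original run):
from scipy.stats import skew
print(skew(data['fFare']))     # strongly right-skewed
print(skew(data['log_fare']))  # much closer to symmetric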
That's great! Unfortunately, a lot of ML algorithms are sensitive to the scale of the variables, so we'll standardize the quantitative variables to get around this.
# Rescale (demean, unit variance) quantitative columns
from sklearn.preprocessing import StandardScaler
scale_transforms = {}
def make_standard_scale(series, name):
"""Helper function to demean and transform a series to unit variance
inputs: series, name for outputing series
outputs: transformed series
"""
temp_scaler = StandardScaler().fit(series)
scale_transforms[name] = temp_scaler
result_series = temp_scaler.transform(series.copy())
return result_series
scale_cols = []
for col in [col for col, t in data[more_cols].dtypes.to_dict().items()
if t == np.dtype('float64')]:
print(col)
data['s' + col] = make_standard_scale(data[col], col)
scale_cols.append('s' + col)
print(data[scale_cols].head())
             ssqrt_fare  ssqrt_age    sfFare  slog_fare     sfAge  slog_age
PassengerId
1             -0.730618  -0.425779 -0.503027  -0.668734 -0.560331 -0.189338
2              1.166388   0.685493  0.734877   1.100280  0.655107  0.587014
3             -0.690188  -0.117726 -0.489978  -0.599835 -0.256471  0.047958
4              0.785042   0.498269  0.383354   0.872359  0.427212  0.470196
5             -0.682892   0.498269 -0.487561  -0.587723  0.427212  0.470196
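Under the hood, StandardScaler just subtracts the mean and divides by the standard deviation. A quick sanity check (a sketch; sklearn uses the population std, ddof=0, while pandas defaults to ddof=1, hence the explicit ddof below):
manual = (data['log_fare'] - data['log_fare'].mean()) / data['log_fare'].std(ddof=0)
print((manual - data['slog_fare']).abs().max())  # expect ~0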
# Run the logistic regression. First set the columns
alpha_cols = simple_reg_columns + scale_cols
# Now actually run the regression
more_model = sm.Logit(data.ix[real_train, 'Survived'], data.ix[real_train, alpha_cols])
alpha_more = 0.01 * len(data.ix[real_train]) * np.ones(data.ix[real_train, alpha_cols].shape[1])
alpha_more[0] = 0
more_results = more_model.fit_regularized(method='l1', alpha=alpha_more)  # n_jobs isn't a statsmodels option
print(more_results.summary())
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.501243434109
            Iterations: 54
            Function evaluations: 54
            Gradient evaluations: 54
                           Logit Regression Results
==============================================================================
Dep. Variable:               Survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      883
Method:                           MLE   Df Model:                            7
Date:                Sun, 02 Mar 2014   Pseudo R-squ.:                  0.3173
Time:                        13:36:29   Log-Likelihood:                -405.06
converged:                       True   LL-Null:                       -593.33
                                        LLR p-value:                 2.559e-77
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -1.8294      0.156    -11.721      0.000        -2.135    -1.523
sex_female     2.1890      0.173     12.651      0.000         1.850     2.528
class_1        1.3665      0.269      5.072      0.000         0.838     1.894
class_2        0.5445      0.213      2.552      0.011         0.126     0.963
embarked_C     0.0841      0.222      0.378      0.705        -0.352     0.520
embarked_Q          0        nan        nan        nan           nan       nan
ssqrt_fare          0        nan        nan        nan           nan       nan
ssqrt_age           0        nan        nan        nan           nan       nan
sfFare         0.0176      0.121      0.145      0.885        -0.220     0.255
slog_fare      0.1485      0.127      1.172      0.241        -0.100     0.397
sfAge               0        nan        nan        nan           nan       nan
slog_age      -0.3128      0.084     -3.739      0.000        -0.477    -0.149
==============================================================================
# Visualize coefficients
viz_sm_coefs(more_results)
# Let's do a likelihood ratio test:
llr = -2 * (log_reg_result.llf - more_results.llf)
df_diff = more_results.df_model - log_reg_result.df_model
from scipy.stats import chi2
print(1-chi2.cdf(llr, df_diff))
0.0116586389642
So it looks like adding more covariates is helpful!
A more convincing analysis would be to repeat the above work, but this time use cross-validation. Let's do that.
# First let's pick a training set at random and repeat the crosstab approach
import random
random.seed(0)
train = random.sample(data.ix[real_train].index, len(data.ix[real_train].index)/2)
test = data.index.drop(list(train) + list(real_test))
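(As an aside, sklearn ships a helper that does this kind of split; in the sklearn of this vintage it lives in sklearn.cross_validation, in newer versions in sklearn.model_selection. A sketch:)
from sklearn.cross_validation import train_test_split
train_alt, test_alt = train_test_split(
    np.array(data.ix[real_train].index), test_size=0.5, random_state=0)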
# Now let's calculate performance!
# I'll make a helper function that we can re-use later to compare performance of
# multiple models
from sklearn.metrics import confusion_matrix, classification_report
def perf_report(df, true_class, pred_class, string_title):
    """
    Prints the confusion matrix and precision/recall of the classifier
    inputs: df, true_class, pred_class, string_title
    outputs: confusion_matrix, class_report
    """
    print(string_title + ' Confusion Matrix')
    conf_matrix = confusion_matrix(df[true_class], df[pred_class])
    print(conf_matrix)
    print(string_title + ' Performance Summary')
    class_report = classification_report(np.array(df[true_class]),
                                         np.array(df[pred_class]),
                                         target_names=['Died', 'Survived'])
    print(class_report)
    accuracy = float(np.diag(conf_matrix).sum()) / conf_matrix.sum()
    # build a single string so this prints cleanly under Python 2 (print(a, b) would print a tuple)
    print('%s accuracy: %r' % (string_title, accuracy))
    return conf_matrix, class_report
data['emp prediction'] = 0
female = data['Sex'] == 'female'
high_class = data['Pclass'] != 3
data.ix[female & high_class, 'emp prediction'] = 1
perf_report(data.ix[real_train], 'Survived', 'emp prediction', 'emp prediction')
print('')  # print() without print_function prints "()" under Python 2
emp prediction Confusion Matrix
[[540   9]
 [181 161]]
emp prediction Performance Summary
             precision    recall  f1-score   support

       Died       0.75      0.98      0.85       549
   Survived       0.95      0.47      0.63       342

avg / total       0.82      0.79      0.77       891

emp prediction accuracy: 0.7867564534231201
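As a sanity check on that accuracy number: the confusion matrix says we classified 540 + 161 = 701 of 891 passengers correctly, and 701 / 891 is about 0.787, matching the reported accuracy.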
# let's look at the perf report for the best performing regressions:
def make_pred(result_model, input_df, output_df, ix, title, cols):
    """
    Adds a binary (0/1) prediction column named title to output_df
    inputs: result_model, input_df, output_df, ix, title, cols (the columns the model was fit on)
    outputs: output_df[title]
    """
    if title not in list(output_df.columns):
        output_df[title] = 0
    input_data = input_df.ix[ix, cols]
    output_df.ix[ix, title] = result_model.predict(input_data)
    output_df.ix[ix, title] = output_df.ix[ix, title].apply(lambda x: 1 if x > 0.5 else 0)
    return output_df[title]
data['log simple'] = log_reg_result.predict(data[simple_reg_columns])
data['log simple'] = data['log simple'].apply(lambda x: 1 if x > 0.5 else 0)
data['log more'] = more_results.predict(data[alpha_cols])
data['log more'] = data['log more'].apply(lambda x: 1 if x > 0.5 else 0)
perf_report(data.ix[test], 'Survived', 'log simple', 'log simple')
perf_report(data.ix[test], 'Survived', 'log more', 'log more')
perf_report(data.ix[test], 'Survived', 'emp prediction', 'emp prediction')
print('')
log simple Confusion Matrix
[[221  51]
 [ 39 135]]
log simple Performance Summary
             precision    recall  f1-score   support

       Died       0.85      0.81      0.83       272
   Survived       0.73      0.78      0.75       174

avg / total       0.80      0.80      0.80       446

log simple accuracy: 0.7982062780269058
log more Confusion Matrix
[[229  43]
 [ 44 130]]
log more Performance Summary
             precision    recall  f1-score   support

       Died       0.84      0.84      0.84       272
   Survived       0.75      0.75      0.75       174

avg / total       0.80      0.80      0.80       446

log more accuracy: 0.804932735426009
emp prediction Confusion Matrix
[[265   7]
 [ 83  91]]
emp prediction Performance Summary
             precision    recall  f1-score   support

       Died       0.76      0.97      0.85       272
   Survived       0.93      0.52      0.67       174

avg / total       0.83      0.80      0.78       446

emp prediction accuracy: 0.7982062780269058
So we see that the logistic regression does in fact outperform simply using the class frequencies. That being said, we might think that we could do even better.
Cool, so that did all right. Now let's tune some stronger classifiers and use the best-performing one to predict the test outcomes.
# Make a grid of 1 to max_features, 1 to max_depth; fix all possible variants
from sklearn.metrics import accuracy_score
scaled_cols = ['sfAge', 'fSibSp', 'fParch', 'slog_fare', 'slog_age', 'ssqrt_age', 'ssqrt_fare', 'fam_size']
quant_cols = ['sfAge', 'fSibSp', 'fParch', 'fam_size', 'sfFare']
bin_cols = ['_Sex', 'Pclass', '_Embarked']
dummy_cols = (
# list(fm_id_dummies.columns.values) +
list(sex_dummies.columns.values) +
list(class_dummies.columns.values) +
list(embark_dummies.columns.values))
print(data.ix[real_train, scaled_cols + dummy_cols].apply(is_bad).sum())
sfAge 0 fSibSp 0 fParch 0 slog_fare 0 slog_age 0 ssqrt_age 0 ssqrt_fare 0 fam_size 0 sex_female 0 class_1 0 class_2 0 embarked_C 0 embarked_Q 0 dtype: int64
# WARNING: this grid search can take a long time to run!
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
n_features = list(range(1, len(bin_cols + quant_cols)+1))
depths = list(range(1, len(bin_cols + quant_cols)+1))
param_grid = {'n_estimators': [50], 'criterion': ['gini', 'entropy'],
'max_features': n_features, 'max_depth': depths,
'oob_score': [True]}
rf_cv_clf = GridSearchCV(RandomForestClassifier(),
param_grid, scoring='accuracy',
refit=True,
verbose=1, cv=3, n_jobs=4)
rf_cv_clf.fit(
data.ix[real_train, bin_cols + quant_cols],
data.ix[real_train, 'Survived'])
Fitting 3 folds for each of 128 candidates, totalling 384 fits
[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  50 jobs       | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 200 jobs       | elapsed:   10.9s
[Parallel(n_jobs=4)]: Done 378 out of 384 | elapsed:   23.5s remaining:    0.4s
[Parallel(n_jobs=4)]: Done 384 out of 384 | elapsed:   23.8s finished
GridSearchCV(cv=3,
       estimator=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=4,
       param_grid={'n_estimators': [50],
            'max_features': [1, 2, 3, 4, 5, 6, 7, 8],
            'oob_score': [True], 'criterion': ['gini', 'entropy'],
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='accuracy',
       verbose=1)
rf_grid_val_scores = [tup.mean_validation_score for tup in rf_cv_clf.grid_scores_]
print(rf_cv_clf.best_params_)
print(rf_cv_clf.best_score_)
{'max_features': 4, 'n_estimators': 50, 'oob_score': True, 'criterion': 'gini', 'max_depth': 7}
0.833894500561
print(rf_cv_clf.best_estimator_.oob_score_)
0.821548821549
grid_std_scores = [np.std(tup.cv_validation_scores) for tup in rf_cv_clf.grid_scores_]
print('grid std scores: \n%s' % grid_std_scores[0:10])
grid std scores:
[0.0027491467371303781, 0.021820675752215003, 0.0027491467371304236, 0.0041993910064803139, 0.010408101566212915, 0.010408101566212915, 0.010408101566212915, 0.010408101566212915, 0.012697764869792095, 0.011445610580455219]
# Visualize performance across two dimensions
# Use grid_scores_.parameters, loop through results - may want to abstract the loop into a helper function
def cv_grid_viz(cv_clf, default_params_dict,
x_param, y_param, title, x_exp=False, y_exp=False, markersize=500):
"""
Helper function to visualize a grid of CV results
inputs: cv_clf, default_params_dict, x_param, y_param, title
keyword inputs: x_exp=False, y_exp=False, markersize=500
outputs: x_list, y_list, acc_graph_list
side-effects: make matplotlib graph
"""
cv_params_list = [tup.parameters for tup in cv_clf.grid_scores_]
    cv_scores = [tup.mean_validation_score for tup in cv_clf.grid_scores_]
x_list = []
y_list = []
acc_graph_list = []
for params, score in zip(cv_params_list, cv_scores):
x_temp = None
y_temp = None
fail = False
for param_key in params:
if param_key in default_params_dict:
if default_params_dict[param_key] == params[param_key]:
continue
else:
fail = True
elif param_key == x_param:
x_temp = params[param_key]
elif param_key == y_param:
y_temp = params[param_key]
else:
continue
        if x_temp is not None and y_temp is not None and not fail:
x_list.append(x_temp)
y_list.append(y_temp)
acc_graph_list.append(score)
if x_exp:
x_list = [np.log(x) for x in x_list]
x_param = 'log(' + x_param + ')'
if y_exp:
y_list = [np.log(y) for y in y_list]
y_param = 'log(' + y_param + ')'
    figsize(10, 8)
    scatter(x_list, y_list, c=acc_graph_list, marker='o', s=markersize)
    xlabel(x_param)
    ylabel(y_param)
    suptitle(title)
    colorbar()
    show()
return x_list, y_list, acc_graph_list
cv_grid_viz(rf_cv_clf,
{'criterion':'entropy','oob_score':True,'n_estimators':50},
'max_features', 'max_depth', 'RF Parameter Tuning (CV Accuracy)')
print('')
data['SK RF'] = rf_cv_clf.best_estimator_.predict(data[bin_cols + quant_cols])
# Save the file
import csv
def save_to_csv(fp, col):
"Helper function to save a column of the dataframe into the preferred kaggle format"
with open(fp, 'w') as f:
writer = csv.writer(f)
writer.writerow(['PassengerId', 'Survived'])
for pid, s in zip(data.ix[real_test, col].index.values, data.ix[real_test, col].values):
writer.writerow([pid, int(s)])
save_to_csv('sk_rf.csv', 'SK RF')
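For what it's worth, pandas can write the same file directly; a sketch (the sk_rf_alt.csv filename is just for illustration):
pred = pd.DataFrame({'Survived': data.ix[real_test, 'SK RF'].astype(int)})
pred.to_csv('sk_rf_alt.csv', index_label='PassengerId')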
dummy_cols = (
# list(fm_id_dummies.columns.values) +
list(sex_dummies.columns.values) +
list(class_dummies.columns.values)
# + list(aCab.columns.values)
)
print(dummy_cols)
['sex_female', 'class_1', 'class_2']
# Train an SVM classifier
from sklearn.svm import SVC
rbf_param_grid = {'C': list(np.exp(range(-10,10))), 'kernel': ['rbf'], 'gamma': list(np.exp(range(-10,10)))}
svm_clf = GridSearchCV(SVC(), param_grid=rbf_param_grid,
scoring='accuracy',
refit=True,
verbose=1, cv=3, n_jobs=4)
lscale_cols = ['slog_age', 'slog_fare']
svm_clf.fit(data.ix[real_train, lscale_cols + dummy_cols], data.ix[real_train, 'Survived'])
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
[Parallel(n_jobs=4)]: Done    1 jobs        | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   50 jobs        | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done  200 jobs        | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done  450 jobs        | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done  800 jobs        | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:  1.0min finished
GridSearchCV(cv=3,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
            degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False,
            random_state=None, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=4,
       param_grid={'kernel': ['rbf'],
            'C': [4.5399929762484854e-05, 0.00012340980408667956,
                  0.00033546262790251185, 0.00091188196555451624,
                  0.0024787521766663585, 0.006737946999085467,
                  0.018315638888734179, 0.049787068367863944,
                  0.1353352832366127, 0.36787944117144233, 1.0,
                  2.7182818284590451, 7.3890560989... 148.4131591025766,
                  403.42879349273511, 1096.6331584284585, 2980.9579870417283,
                  8103.0839275753842]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='accuracy',
       verbose=1)
# Visualize SVM performance across two dimensions
cv_grid_viz(svm_clf,
{'kernel':'rbf'},
'C', 'gamma', 'SVM Parameter Tuning (CV Accuracy)',
x_exp=True, y_exp=True, markersize=175)
print(svm_clf.best_score_)
print(svm_clf.best_estimator_)
0.824915824916
SVC(C=2.71828182846, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=2.71828182846, kernel=rbf, max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
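For intuition: the RBF kernel is K(x, x') = exp(-gamma * ||x - x'||^2), so gamma controls how fast similarity decays with distance, while C trades off margin width against training errors. Both tuned values landed on e^1 (about 2.72), which was one of the grid points since we searched over exp(-10) through exp(9).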
# Make predictions and save
data['SVM Pred'] = svm_clf.best_estimator_.predict(data[lscale_cols + dummy_cols])
print(data['SVM Pred'].head())
save_to_csv('svm.csv', 'SVM Pred')
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: SVM Pred, dtype: float64
## Logistic Regression revisited
from sklearn.linear_model import LogisticRegression
alpha_params = np.sqrt(np.exp(range(-25, 10, 1)))
C_list = [1/a for a in alpha_params]
parameter_grid = {'C' : C_list,
'penalty': ['l1', 'l2'],
'intercept_scaling': [1000.],
#'class_weight':['auto']
}
log_reg = LogisticRegression()
log_clf = GridSearchCV(log_reg, parameter_grid,
scoring='accuracy', refit=True,
verbose=1, cv=10, n_jobs=4)
log_clf.fit(data.ix[real_train, lscale_cols + dummy_cols], data.ix[real_train, 'Survived'])
Fitting 10 folds for each of 70 candidates, totalling 700 fits
[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 jobs       | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 jobs       | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 450 jobs       | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    1.4s finished
GridSearchCV(cv=10,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
            fit_intercept=True, intercept_scaling=1, penalty='l2',
            random_state=None, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=4,
       param_grid={'penalty': ['l1', 'l2'],
            'C': [268337.28652087448, 162754.79141900392, 98715.771010760494,
                  59874.141715197817, 36315.502674246636, 22026.465794806714,
                  13359.726829661871, 8103.0839275753833, 4914.7688402991344,
                  2980.9579870417283, 1808.0424144560632, 1096.6331584284585,
                  665.1416330443619...4, 0.030197383422318504,
                  0.018315638888734182, 0.011108996538242306],
            'intercept_scaling': [1000.0]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='accuracy',
       verbose=1)
cv_grid_viz(log_clf,
{'penalty':'l1','class_weight':'auto'},
'C', 'intercept_scaling', 'Logistic Regression Parameter Tuning (CV Accuracy)',
x_exp=True, markersize=175)
print(log_clf.best_score_)
print(log_clf.best_params_)
0.806958473625
{'penalty': 'l1', 'C': 2.7182818284590451, 'intercept_scaling': 1000.0}
data['SK SM'] = log_clf.predict(data.ix[:, lscale_cols + dummy_cols])
print(data['SK SM'].head())
perf_report(data.ix[real_train], 'Survived', 'SK SM', 'SK SM train')
save_to_csv('sk_logit.csv', 'SK SM')
print('')
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: SK SM, dtype: float64
SK SM train Confusion Matrix
[[474  75]
 [ 97 245]]
SK SM train Performance Summary
             precision    recall  f1-score   support

       Died       0.83      0.86      0.85       549
   Survived       0.77      0.72      0.74       342

avg / total       0.81      0.81      0.81       891

SK SM train accuracy: 0.8069584736251403
We might as well train an AdaBoost classifier while we're at it.
# Train an adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
ada_param_grid = {'n_estimators':[x*50 + 100 for x in range(1,11)],
'learning_rate':[x/100. for x in range(1,11)]}
ada_clf = GridSearchCV(AdaBoostClassifier(), param_grid=ada_param_grid,
scoring='accuracy',
refit=True,
verbose=1, cv=3, n_jobs=4)
ada_clf.fit(data.ix[real_train, bin_cols + quant_cols], data.ix[real_train, 'Survived'])
print('Done')
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  50 jobs       | elapsed:   16.7s
[Parallel(n_jobs=4)]: Done 200 jobs       | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 294 out of 300 | elapsed:  1.6min remaining:    1.9s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  1.6min finished
Done
print(ada_clf.best_score_)
cv_grid_viz(ada_clf,
{},
'n_estimators', 'learning_rate', 'AdaBoost Parameter Tuning (CV Accuracy)')
print('')
0.810325476992
# Try doing a long, staged fit
ada_clf_staged = AdaBoostClassifier(n_estimators=1000, learning_rate=0.05)
ada_clf_staged.fit(data.ix[train, bin_cols + quant_cols], data.ix[train, 'Survived'])
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(compute_importances=None,
              criterion='gini', max_depth=1, max_features=None,
              min_density=None, min_samples_leaf=1, min_samples_split=2,
              random_state=None, splitter='best'),
          learning_rate=0.05, n_estimators=1000, random_state=None)
# Visualize training set performance to ensure convergence
staged_scores_train = ada_clf_staged.staged_score(data.ix[train, bin_cols + quant_cols], data.ix[train, 'Survived'])
staged_scores_test = ada_clf_staged.staged_score(data.ix[test, bin_cols + quant_cols], data.ix[test, 'Survived'])
plt.plot(list(staged_scores_train), c='blue')
plt.plot(list(staged_scores_test), c='red')
plt.ylabel('Accuracy')
plt.xlabel('n_estimators')
plt.title('AdaBoost Convergence Test')
plt.show()
print(ada_clf_staged.score(data.ix[test, bin_cols + quant_cols], data.ix[test, 'Survived']))
0.820627802691
# Save the results
data['ada'] = ada_clf.predict(data[bin_cols + quant_cols])
perf_report(data.ix[real_train], 'Survived', 'SK SM', 'SK SM train')
perf_report(data.ix[real_train], 'Survived', 'SVM Pred', 'SVM Pred train')
perf_report(data.ix[real_train], 'Survived', 'SK RF', 'SK RF train')
perf_report(data.ix[real_train], 'Survived', 'ada', 'AdaBoost train')
print('')
SK SM train Confusion Matrix
[[474  75]
 [ 97 245]]
SK SM train Performance Summary
             precision    recall  f1-score   support

       Died       0.83      0.86      0.85       549
   Survived       0.77      0.72      0.74       342

avg / total       0.81      0.81      0.81       891

SK SM train accuracy: 0.8069584736251403
SVM Pred train Confusion Matrix
[[507  42]
 [ 80 262]]
SVM Pred train Performance Summary
             precision    recall  f1-score   support

       Died       0.86      0.92      0.89       549
   Survived       0.86      0.77      0.81       342

avg / total       0.86      0.86      0.86       891

SVM Pred train accuracy: 0.8630751964085297
SK RF train Confusion Matrix
[[533  16]
 [ 74 268]]
SK RF train Performance Summary
             precision    recall  f1-score   support

       Died       0.88      0.97      0.92       549
   Survived       0.94      0.78      0.86       342

avg / total       0.90      0.90      0.90       891

SK RF train accuracy: 0.898989898989899
AdaBoost train Confusion Matrix
[[488  61]
 [ 96 246]]
AdaBoost train Performance Summary
             precision    recall  f1-score   support

       Died       0.84      0.89      0.86       549
   Survived       0.80      0.72      0.76       342

avg / total       0.82      0.82      0.82       891

AdaBoost train accuracy: 0.8237934904601572